The goal of this notebook is to identify other clusters of mutations using the same approach that was used to define the mutations in genome-1 and genome-2. Hopefully, these can be used to establish other clonal haplotypes on the background of genome-1 and genome-2.
## ==== File paths input data ==== ##
# Variant calls from lofreq and varscan, already labeled with G1 and G2 mutations
labeled.data <- "../../results/variants/genotyped_variants.csv"
# Sample metadata
sample.data <- "../../config/samples.csv"
# Annotations
annotations.filepath <- "../../config/annotations.csv"

## ==== File paths output data ==== ##
# Directory that receives the outputs of this notebook
output.path <- "../../results/variants"
if (!dir.exists(output.path)) {
  # recursive = TRUE: the path is nested, and a plain dir.create() fails when a
  # parent directory is missing. With warnings suppressed that failure would go
  # unnoticed until the final write.
  dir.create(output.path, recursive = TRUE, showWarnings = FALSE)
}
# Load the variant calls that already carry their G1, G2, and G1-1 labels
labeled.df <- read_csv(labeled.data, show_col_types = FALSE)

# Unique SNP / Haplotype / Background combinations
haplotypes.label <- distinct(select(labeled.df, SNP, Haplotype, Background))

# Give every SNP an allele frequency in every tissue. Pivoting wide fills the
# SNP x Tissue combinations that are absent with 0; pivoting back to long keeps
# those filled-in rows. Depth is then joined back from the original calls
# (0 where there was none), followed by the haplotype labels.
expanded.df <- labeled.df %>%
  select(SNP, Tissue, AF) %>%
  pivot_wider(names_from = Tissue, values_from = AF, values_fill = 0) %>%
  pivot_longer(cols = -SNP, names_to = "Tissue", values_to = "AF") %>%
  left_join(select(labeled.df, SNP, Tissue, DP), by = c("SNP", "Tissue")) %>%
  mutate(DP = if_else(is.na(DP), 0, DP)) %>%
  left_join(haplotypes.label, by = "SNP")
# Per-tissue summary of the allele frequencies of the two major genomes.
# For every Tissue x Haplotype combination this holds the mean AF (column `AF`),
# its standard deviation, the number of rows N, the standard error, and the
# bounds of a 95% t-based confidence interval around the mean.
tissue.mean <- labeled.df %>%
  filter(Haplotype %in% c("genome-1", "genome-2")) %>%
  group_by(Tissue, Haplotype) %>%
  summarize(AF.mean = mean(AF, na.rm = TRUE),
            SD = sd(AF, na.rm = TRUE),
            N = n()) %>%
  mutate(SE = SD / sqrt(N),
         # Both bounds used to hold the same value (the CI half-width);
         # they are now the actual lower and upper limits of the interval.
         Lower.CI = AF.mean - qt(1 - (0.05 / 2), N - 1) * SE,
         Upper.CI = AF.mean + qt(1 - (0.05 / 2), N - 1) * SE) %>%
  rename("AF" = AF.mean)
# Display order of the tissues, roughly following their position in the brain.
# Used as the factor levels for the x axis of the plots.
tissue_order <- c(
  "Frontal Cortex 1", "Frontal Cortex 3", "Frontal Cortex 2",
  "Temporal Lobe", "Parietal Lobe", "Occipital Lobe",
  "Hippocampus", "Internal Capsule",
  "Cerebellum", "Cerebellum Nucleus",
  "Midbrain", "UBS", "Brain Stem"
)
I’ll try to use the same haplotyping approach that I used to cluster the mutations for genome-1 and genome-2. To do this, I’ll break down the mutations into bins of frequency. I think this will work best for getting clusters of mutations.
# Cluster SNPs by how similarly their allele frequencies vary across tissues.
#
# Arguments:
#   list.of.snps  SNP identifiers to cluster; matched against the SNP column.
#   snp.df        Long data frame with (at least) SNP, Tissue and AF columns.
#   n.clusters    Number of clusters (k) handed to pam().
#
# Returns the rows of `snp.df` belonging to `list.of.snps`, with three columns
# added:
#   cluster       cluster id as a character string ("no cluster" when the SNP
#                 received no assignment),
#   n             number of distinct SNPs in that cluster,
#   cluster_size  label of the form "<cluster> (<n> snps in cluster)".
cluster.snps <- function(list.of.snps, snp.df, n.clusters) {
  selected.rows <- filter(snp.df, SNP %in% list.of.snps)

  # One row per tissue, one column per SNP; absent combinations become 0
  af.wide <- selected.rows %>%
    select(AF, SNP, Tissue) %>%
    pivot_wider(names_from = SNP, values_from = AF, values_fill = 0) %>%
    select(-Tissue)

  # Pairwise correlation between SNP columns, turned into a dissimilarity so
  # that strongly positively correlated SNPs sit close to 0
  pairwise.r <- cor(af.wide, use = "pairwise.complete.obs")
  dissimilarity <- as.dist(1 - pairwise.r)

  # k-medoids clustering. The component is spelled out in full: `$cluster` only
  # resolved to `clustering` through partial matching of `$` on a list.
  membership <- pam(dissimilarity, n.clusters)$clustering
  membership.df <- data.frame(SNP = names(membership), cluster = membership)

  # Attach the cluster id to every row of the selected SNPs
  clustered <- selected.rows %>%
    left_join(membership.df, by = "SNP") %>%
    mutate(cluster = if_else(is.na(cluster), "no cluster", as.character(cluster)))

  # Number of distinct SNPs per cluster, plus a label used for plot facets
  cluster.sizes <- clustered %>%
    distinct(SNP, cluster) %>%
    count(cluster) %>%
    mutate(cluster_size = paste0(cluster, " (", n, " snps in cluster)"))

  left_join(clustered, cluster.sizes, by = "cluster")
}
Can I link the SNPs that reach reasonably high frequency in some samples? I define these as reaching 25% or more in at least one tissue. However, I’m leaving out the mutations that are fixed in 10 or more samples - these should probably be included in the SSPE consensus (this will eventually be amended).
# SNPs that are fixed (AF >= 0.95) in 10 or more of the 13 samples are set
# aside and handled separately; they should probably go into the reference.
fixed.in.more.than.ten.snps <- expanded.df %>%
  filter(Haplotype == "subclonal", AF >= .95) %>%
  count(SNP) %>%
  filter(n >= 10) %>%
  pull(SNP)

# SNPs reaching 25% or more in at least one tissue. Mutations that were already
# haplotyped and the 'fixed' mutations missed in the reference are excluded.
twentyfive.percent.or.more.snps <- expanded.df %>%
  filter(Haplotype == "subclonal",
         AF >= 0.25,
         !SNP %in% fixed.in.more.than.ten.snps) %>%
  pull(SNP) %>%
  unique()
# Split the >= 25% SNPs into 20 clusters
twentyfive.percent.or.more.clusters.df <- cluster.snps(
  list.of.snps = twentyfive.percent.or.more.snps,
  snp.df = expanded.df,
  n.clusters = 20
)

# Mean allele frequency of every cluster in every tissue, for plotting
twentyfive.percent.or.more.clusters.mean <- summarize(
  group_by(twentyfive.percent.or.more.clusters.df, Tissue, cluster, cluster_size, n),
  AF = mean(AF)
)
## `summarise()` has grouped output by 'Tissue', 'cluster', 'cluster_size'. You can override using the `.groups` argument.
# One facet per cluster: a black line per SNP, a coloured line for the cluster
# mean, and shaded mean +/- SD bands for genome-1 and genome-2.
twentyfive.percent.or.more.clusters.df %>%
  ggplot(aes(x = reorder(Tissue, -AF), y = AF)) +
  geom_line(aes(group = SNP)) +
  geom_line(
    data = twentyfive.percent.or.more.clusters.mean,
    mapping = aes(x = Tissue, y = AF, group = 1, colour = cluster),
    size = 1
  ) +
  geom_ribbon(
    data = tissue.mean,
    mapping = aes(x = Tissue, ymin = AF - SD, ymax = AF + SD,
                  group = Haplotype, fill = Haplotype),
    alpha = 0.2,
    colour = NA
  ) +
  facet_wrap(~cluster_size) +
  scale_fill_manual(values = c("#424ef5", "#cf1919")) +
  labs(x = "Tissue", y = "Allele Frequency", fill = "Haplotype", colour = "Cluster") +
  theme_bw(20) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    strip.text.x = element_text(size = 12)
  )
Some clusters have only a single mutation in them. Clearly, some of these belong to genome-1 or genome-2, but are missing from a single tissue, perhaps due to low coverage in that particular tissue. Also, some of these clusters with one SNP might be consensus mutations or mutations on both backgrounds that need to be resolved separately.
# Same layout restricted to clusters holding exactly one SNP.
# No cluster-mean line is drawn in this plot.
twentyfive.percent.or.more.clusters.df %>%
  filter(n == 1) %>%
  ggplot(aes(x = reorder(Tissue, -AF), y = AF)) +
  geom_line(aes(group = SNP)) +
  geom_ribbon(
    data = tissue.mean,
    mapping = aes(x = Tissue, ymin = AF - SD, ymax = AF + SD,
                  group = Haplotype, fill = Haplotype),
    alpha = 0.2,
    colour = NA
  ) +
  facet_wrap(~cluster_size) +
  scale_fill_manual(values = c("#424ef5", "#cf1919")) +
  labs(x = "Tissue", y = "Allele Frequency", fill = "Haplotype", colour = "Cluster") +
  theme_bw(20) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    strip.text.x = element_text(size = 12)
  )
# Cluster 8 is clearly genome-1
genome.1.missing.snps <- with(
  twentyfive.percent.or.more.clusters.df,
  unique(SNP[cluster %in% c(8)])
)
# Cluster 13 is likely genome-1-1
genome.1.1.missing.snps <- with(
  twentyfive.percent.or.more.clusters.df,
  unique(SNP[cluster %in% c(13)])
)
# Cluster 6 is clearly genome-2
genome.2.missing.snps <- with(
  twentyfive.percent.or.more.clusters.df,
  unique(SNP[cluster %in% c(6)])
)
# Clusters 11 and 12 break the rules and may sit on both backgrounds
maybe.in.both.snps <- with(
  twentyfive.percent.or.more.clusters.df,
  unique(SNP[cluster %in% c(11, 12)])
)

# Write these assignments into a copy of the labeled calls; every SNP not
# listed above keeps its current Background and Haplotype.
updated.haplotype.df <- labeled.df %>%
  mutate(
    Background = case_when(
      SNP %in% c(genome.1.missing.snps, genome.1.1.missing.snps) ~ "genome-1",
      SNP %in% genome.2.missing.snps ~ "genome-2",
      SNP %in% maybe.in.both.snps ~ "both",
      SNP %in% fixed.in.more.than.ten.snps ~ "fixed",
      TRUE ~ Background
    ),
    Haplotype = case_when(
      SNP %in% genome.1.missing.snps ~ "genome-1",
      SNP %in% genome.1.1.missing.snps ~ "genome-1-1",
      SNP %in% genome.2.missing.snps ~ "genome-2",
      SNP %in% maybe.in.both.snps ~ "both",
      SNP %in% fixed.in.more.than.ten.snps ~ "fixed",
      TRUE ~ Haplotype
    )
  )
Now, let’s look at the remainder of the clusters of SNPs. These are SNPs that weren’t genome-1/2 mutations missing from a tissue or SNPs likely on both backgrounds.
# Clusters not yet assigned above (everything except 8, 13, 6, 11 and 12),
# with tissues in anatomical order and one coloured line per SNP.
twentyfive.percent.or.more.clusters.df %>%
  filter(!cluster %in% c("8", "13", "6", "11", "12")) %>%
  ggplot(aes(x = factor(Tissue, levels = tissue_order), y = AF)) +
  # The line layer inherits the filtered data and the x / y mapping of the plot
  geom_line(aes(group = SNP, colour = cluster), size = 1) +
  geom_ribbon(
    data = tissue.mean,
    mapping = aes(x = factor(Tissue, levels = tissue_order),
                  ymin = AF - SD, ymax = AF + SD,
                  group = Haplotype, fill = Haplotype),
    alpha = 0.2,
    colour = NA
  ) +
  facet_wrap(~cluster_size) +
  scale_fill_manual(values = c("#424ef5", "#cf1919")) +
  labs(x = "Tissue", y = "Allele Frequency", fill = "Haplotype", colour = "Cluster") +
  theme_bw(20) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    strip.text.x = element_text(size = 12)
  )
16 is probably the same as 1 but appears to be different because some SNPs are missing due to low coverage in the Cerebellum and Cerebellum Nucleus. Clusters 10 and 15 are likely fixed, just missing because of low coverage. We’ll call these fixed for now.
# Likely fixed (k-medoids clusters 10 and 15)
probably.fixed.snps <- with(
  twentyfive.percent.or.more.clusters.df,
  unique(SNP[cluster %in% c(10, 15)])
)
# Genome-1 (k-medoids clusters 1 and 16)
cluster.1.snps <- with(
  twentyfive.percent.or.more.clusters.df,
  unique(SNP[cluster %in% c(1, 16)])
)
# Genome-1 (k-medoids cluster 14)
cluster.2.snps <- with(
  twentyfive.percent.or.more.clusters.df,
  unique(SNP[cluster %in% c(14)])
)
# Genome-1 (k-medoids cluster 18)
cluster.3.snps <- with(
  twentyfive.percent.or.more.clusters.df,
  unique(SNP[cluster %in% c(18)])
)
# Genome-2 (k-medoids cluster 19)
cluster.4.snps <- with(
  twentyfive.percent.or.more.clusters.df,
  unique(SNP[cluster %in% c(19)])
)
# Genome-2 (k-medoids cluster 2)
cluster.5.snps <- with(
  twentyfive.percent.or.more.clusters.df,
  unique(SNP[cluster %in% c(2)])
)
# Genome-2 (k-medoids cluster 20)
cluster.6.snps <- with(
  twentyfive.percent.or.more.clusters.df,
  unique(SNP[cluster %in% c(20)])
)
# Genome-2 (k-medoids cluster 5)
cluster.7.snps <- with(
  twentyfive.percent.or.more.clusters.df,
  unique(SNP[cluster %in% c(5)])
)
# Background not clear yet (k-medoids cluster 7)
cluster.8.snps <- with(
  twentyfive.percent.or.more.clusters.df,
  unique(SNP[cluster %in% c(7)])
)
# Background not clear yet (k-medoids cluster 4)
cluster.9.snps <- with(
  twentyfive.percent.or.more.clusters.df,
  unique(SNP[cluster %in% c(4)])
)

# Record the new assignments; SNPs in none of the lists keep their labels.
updated.haplotype.df <- updated.haplotype.df %>%
  mutate(
    Background = case_when(
      SNP %in% probably.fixed.snps ~ "fixed",
      SNP %in% c(cluster.1.snps, cluster.2.snps, cluster.3.snps) ~ "genome-1",
      SNP %in% c(cluster.4.snps, cluster.5.snps,
                 cluster.6.snps, cluster.7.snps) ~ "genome-2",
      SNP %in% c(cluster.8.snps, cluster.9.snps) ~ "unknown",
      TRUE ~ Background
    ),
    Haplotype = case_when(
      SNP %in% probably.fixed.snps ~ "fixed",
      SNP %in% cluster.1.snps ~ "cluster 1",
      SNP %in% cluster.2.snps ~ "cluster 2",
      SNP %in% cluster.3.snps ~ "cluster 3",
      SNP %in% cluster.4.snps ~ "cluster 4",
      SNP %in% cluster.5.snps ~ "cluster 5",
      SNP %in% cluster.6.snps ~ "cluster 6",
      SNP %in% cluster.7.snps ~ "cluster 7",
      SNP %in% cluster.8.snps ~ "cluster 8",
      SNP %in% cluster.9.snps ~ "cluster 9",
      TRUE ~ Haplotype
    )
  )
What clusters are left? There are three with a single mutation. None of these single mutations seem all that important. There are some mutations that are probably in the 3’ and 5’ UTR. The other mutation is probably synonymous, but it’s in P/V/C, so it could interrupt another reading frame. We’ll keep these in mind for later.
# SNPs whose Haplotype is still "subclonal" after the annotation so far.
# NOTE(review): the variable name is misspelled ("sublonal"); it is kept as-is
# in case anything else refers to it.
sublonal.so.far <- updated.haplotype.df %>%
  filter(Haplotype == "subclonal") %>%
  pull(SNP)

# Plot the >= 25% SNPs that are still unassigned, one coloured line per SNP
twentyfive.percent.or.more.clusters.df %>%
  filter(SNP %in% sublonal.so.far) %>%
  ggplot(aes(x = factor(Tissue, levels = tissue_order), y = AF)) +
  # The line layer inherits the filtered data and the x / y mapping of the plot
  geom_line(aes(group = SNP, colour = cluster), size = 1) +
  geom_ribbon(
    data = tissue.mean,
    mapping = aes(x = factor(Tissue, levels = tissue_order),
                  ymin = AF - SD, ymax = AF + SD,
                  group = Haplotype, fill = Haplotype),
    alpha = 0.2,
    colour = NA
  ) +
  facet_wrap(~cluster_size) +
  scale_fill_manual(values = c("#424ef5", "#cf1919")) +
  labs(x = "Tissue", y = "Allele Frequency", fill = "Haplotype", colour = "Cluster") +
  theme_bw(20) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    strip.text.x = element_text(size = 12)
  )
Most of these will be challenging to haplotype. I’ll only try to specify the clusters that seem the most clear-cut.
# Subclonal SNPs that sit between 5% and 25% in at least one tissue. SNPs
# already in the >= 25% set and the 'fixed' SNPs are left out.
twentyfive.percent.or.fewer.snps <- expanded.df %>%
  filter(!SNP %in% twentyfive.percent.or.more.snps,
         !SNP %in% fixed.in.more.than.ten.snps,
         Haplotype == "subclonal",
         AF < 0.25 & AF > 0.05) %>%
  pull(SNP) %>%
  unique()

# Split these SNPs into 30 clusters
twentyfive.percent.or.fewer.clusters.df <- cluster.snps(
  list.of.snps = twentyfive.percent.or.fewer.snps,
  snp.df = expanded.df,
  n.clusters = 30
)

# Mean allele frequency of every cluster in every tissue
twentyfive.percent.or.fewer.clusters.mean <- summarize(
  group_by(twentyfive.percent.or.fewer.clusters.df, Tissue, cluster, cluster_size, n),
  AF = mean(AF)
)
## `summarise()` has grouped output by 'Tissue', 'cluster', 'cluster_size'. You can override using the `.groups` argument.
# Low-frequency clusters that contain more than one SNP: per-SNP lines, a
# coloured cluster-mean line, and the genome-1 / genome-2 mean +/- SD bands.
twentyfive.percent.or.fewer.clusters.df %>%
  filter(n > 1) %>%
  ggplot(aes(x = reorder(Tissue, -AF), y = AF)) +
  geom_line(aes(group = SNP)) +
  geom_line(
    data = filter(twentyfive.percent.or.fewer.clusters.mean, n > 1),
    mapping = aes(x = Tissue, y = AF, group = 1, colour = cluster),
    size = 1
  ) +
  geom_ribbon(
    data = tissue.mean,
    mapping = aes(x = Tissue, ymin = AF - SD, ymax = AF + SD,
                  group = Haplotype, fill = Haplotype),
    alpha = 0.2,
    colour = NA
  ) +
  facet_wrap(~cluster_size) +
  scale_fill_manual(values = c("#424ef5", "#cf1919")) +
  labs(x = "Tissue", y = "Allele Frequency", fill = "Haplotype", colour = "Cluster") +
  theme_bw(20) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    strip.text.x = element_text(size = 12)
  )
# Low-frequency clusters selected for a closer look
promising.subclonal.clusters <- c(10, 12, 14, 18, 2, 5)

# Same plot as for all low-frequency clusters, limited to the selected ones and
# with tissues in anatomical order.
twentyfive.percent.or.fewer.clusters.df %>%
  filter(cluster %in% promising.subclonal.clusters) %>%
  ggplot(aes(x = factor(Tissue, levels = tissue_order), y = AF)) +
  geom_line(aes(group = SNP)) +
  # x and y are inherited from the plot; only the data set differs
  geom_line(
    data = filter(twentyfive.percent.or.fewer.clusters.mean,
                  cluster %in% promising.subclonal.clusters),
    mapping = aes(group = 1, colour = cluster),
    size = 1
  ) +
  geom_ribbon(
    data = tissue.mean,
    mapping = aes(x = factor(Tissue, levels = tissue_order),
                  ymin = AF - SD, ymax = AF + SD,
                  group = Haplotype, fill = Haplotype),
    alpha = 0.2,
    colour = NA
  ) +
  facet_wrap(~cluster_size) +
  scale_fill_manual(values = c("#424ef5", "#cf1919")) +
  labs(x = "Tissue", y = "Allele Frequency", fill = "Haplotype", colour = "Cluster") +
  theme_bw(20) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    strip.text.x = element_text(size = 12)
  )
# Background unknown (low-frequency k-medoids cluster 10)
cluster.10.snps <- with(
  twentyfive.percent.or.fewer.clusters.df,
  unique(SNP[cluster %in% c(10)])
)
# Background unknown (low-frequency k-medoids cluster 12)
cluster.11.snps <- with(
  twentyfive.percent.or.fewer.clusters.df,
  unique(SNP[cluster %in% c(12)])
)
# Background unknown (low-frequency k-medoids cluster 14)
cluster.12.snps <- with(
  twentyfive.percent.or.fewer.clusters.df,
  unique(SNP[cluster %in% c(14)])
)
# Background unknown (low-frequency k-medoids cluster 18)
cluster.13.snps <- with(
  twentyfive.percent.or.fewer.clusters.df,
  unique(SNP[cluster %in% c(18)])
)
# Genome-1 (low-frequency k-medoids clusters 2 and 5)
cluster.14.snps <- with(
  twentyfive.percent.or.fewer.clusters.df,
  unique(SNP[cluster %in% c(2, 5)])
)

# Record the new assignments; SNPs in none of the lists keep their labels.
updated.haplotype.df <- updated.haplotype.df %>%
  mutate(
    Background = case_when(
      SNP %in% c(cluster.10.snps, cluster.11.snps,
                 cluster.12.snps, cluster.13.snps) ~ "unknown",
      SNP %in% cluster.14.snps ~ "genome-1",
      TRUE ~ Background
    ),
    Haplotype = case_when(
      SNP %in% cluster.10.snps ~ "cluster 10",
      SNP %in% cluster.11.snps ~ "cluster 11",
      SNP %in% cluster.12.snps ~ "cluster 12",
      SNP %in% cluster.13.snps ~ "cluster 13",
      SNP %in% cluster.14.snps ~ "cluster 14",
      TRUE ~ Haplotype
    )
  )
We were able to cluster a reasonable number of the high-frequency mutations. How many mutations haven’t been haplotyped yet?
# SNPs whose Background is still 'subclonal', i.e. no assignment at all yet
mutations.with.no.info <- updated.haplotype.df %>%
  filter(Background == "subclonal") %>%
  distinct(SNP) %>%
  pull(SNP)

print(paste("There are about", length(mutations.with.no.info), "that are still totally un-haplotyped."))
## [1] "There are about 343 that are still totally un-haplotyped."
# Unassigned SNPs whose highest observed frequency never exceeds 5%; these are
# nearly impossible to haplotype. summarise() already returns one row per SNP.
mutations.never.above.5perc <- updated.haplotype.df %>%
  filter(Background == "subclonal") %>%
  group_by(SNP) %>%
  summarise(Max = max(AF)) %>%
  filter(Max <= 0.05) %>%
  pull(SNP)

# SNPs that have received some assignment (Background is no longer 'subclonal')
mutations.with.info <- updated.haplotype.df %>%
  filter(Background != "subclonal") %>%
  distinct(SNP) %>%
  pull(SNP)

print(paste("There are ", length(mutations.with.info), "that have at been clustered."))
## [1] "There are 152 that have at been clustered."
# Unassigned SNPs that do rise above 5% in at least one sample
mutations.left.to.haplotype <- mutations.with.no.info[
  !(mutations.with.no.info %in% mutations.never.above.5perc)
]
print(paste("There are still about", length(mutations.left.to.haplotype), "that can reasonably be haplotyped."))
## [1] "There are still about 158 that can reasonably be haplotyped."
# Distribution of the observed allele frequencies of the SNPs that are still
# left to haplotype
ggplot(
  filter(updated.haplotype.df, SNP %in% mutations.left.to.haplotype),
  aes(x = AF)
) +
  geom_histogram(bins = 30) +
  theme_bw()
Here are the frequencies of all current haplotypes in the 13 tissue samples.
# Refresh the SNP -> Haplotype / Background lookup from the updated annotation
haplotypes.label <- distinct(select(updated.haplotype.df, SNP, Haplotype, Background))

# Facet order for the haplotype plot: clusters 1 to 14, then genome-1-1
haplotype.order <- c(
  "cluster 1", "cluster 2", "cluster 3", "cluster 4", "cluster 5",
  "cluster 6", "cluster 7", "cluster 8", "cluster 9", "cluster 10",
  "cluster 11", "cluster 12", "cluster 13", "cluster 14",
  "genome-1-1"
)
# Rebuild the complete SNP x Tissue table from the updated annotation.
# SNP / tissue combinations that are absent get AF 0; depth still comes from
# the original calls in labeled.df (0 where there was none); the refreshed
# haplotype labels are attached last.
expanded.df <- updated.haplotype.df %>%
  select(SNP, Tissue, AF) %>%
  pivot_wider(names_from = Tissue, values_from = AF, values_fill = 0) %>%
  pivot_longer(cols = -SNP, names_to = "Tissue", values_to = "AF") %>%
  left_join(select(labeled.df, SNP, Tissue, DP), by = c("SNP", "Tissue")) %>%
  mutate(DP = if_else(is.na(DP), 0, DP)) %>%
  left_join(haplotypes.label, by = "SNP")
# Per-tissue summary of the allele frequencies of the two major genomes, now
# computed from the updated annotation. For every Tissue x Haplotype
# combination: mean AF (column `AF`), SD, number of rows N, standard error and
# the bounds of a 95% t-based confidence interval. Haplotype is renamed to
# Genotype in the result.
tissue.mean <- updated.haplotype.df %>%
  filter(Haplotype %in% c("genome-1", "genome-2")) %>%
  group_by(Tissue, Haplotype) %>%
  summarize(AF.mean = mean(AF, na.rm = TRUE),
            SD = sd(AF, na.rm = TRUE),
            N = n()) %>%
  mutate(SE = SD / sqrt(N),
         # Both bounds used to hold the same value (the CI half-width);
         # they are now the actual lower and upper limits of the interval.
         Lower.CI = AF.mean - qt(1 - (0.05 / 2), N - 1) * SE,
         Upper.CI = AF.mean + qt(1 - (0.05 / 2), N - 1) * SE) %>%
  rename("AF" = AF.mean, "Genotype" = Haplotype)

# Mean allele frequency of every haplotype in every tissue
haplotype.mean <- expanded.df %>%
  group_by(Tissue, Haplotype) %>%
  summarize(AF = mean(AF))
# One facet per newly defined haplotype (the clusters and genome-1-1). The
# "fixed", "both", "subclonal", "genome-1" and "genome-2" labels are left out.
# Each facet shows the individual SNPs, the haplotype mean, and the genome-1 /
# genome-2 mean +/- SD bands.
expanded.df %>%
  filter(!Haplotype %in% c("fixed", "both", "subclonal", "genome-1", "genome-2")) %>%
  ggplot(aes(x = factor(Tissue, levels = tissue_order), y = AF)) +
  geom_line(aes(group = SNP)) +
  # x and y are inherited from the plot; only the data set differs
  geom_line(
    data = filter(haplotype.mean,
                  !Haplotype %in% c("fixed", "both", "subclonal", "genome-1", "genome-2")),
    mapping = aes(group = 1, colour = Haplotype),
    size = 1
  ) +
  geom_ribbon(
    data = tissue.mean,
    mapping = aes(x = factor(Tissue, levels = tissue_order),
                  ymin = AF - SD, ymax = AF + SD,
                  group = Genotype, fill = Genotype),
    alpha = 0.2,
    colour = NA
  ) +
  facet_wrap(~factor(Haplotype, levels = haplotype.order)) +
  scale_fill_manual(values = c("#424ef5", "#cf1919")) +
  labs(x = "Tissue", y = "Allele Frequency", fill = "Genotype", colour = "Cluster") +
  theme_bw(20) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    strip.text.x = element_text(size = 12)
  )
# Final Background per cluster; any other Haplotype keeps its current
# Background.
# NOTE(review): "cluster 4" was given Background "genome-2" when it was first
# annotated, but is set to "genome-1" here - confirm this change is intended.
updated.haplotype.df <- updated.haplotype.df %>%
  mutate(
    Background = case_when(
      Haplotype %in% c("cluster 1", "cluster 2", "cluster 3", "cluster 4",
                       "cluster 9", "cluster 11", "cluster 12", "cluster 13",
                       "cluster 14") ~ "genome-1",
      Haplotype %in% c("cluster 5", "cluster 6", "cluster 7", "cluster 8",
                       "cluster 10") ~ "genome-2",
      TRUE ~ Background
    )
  )

head(updated.haplotype.df)
## # A tibble: 6 × 14
## POS REF ALT AF DP Effect Gene_Name AA_Change Accession Tissue
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 96 G A 0.0222 148637 <NA> N <NA> Temporal_… Tempor…
## 2 167 T C 0.027 189143 Synony… N Ile20Ile Temporal_… Tempor…
## 3 242 T C 0.0293 150213 Synony… N Ile45Ile Temporal_… Tempor…
## 4 366 T C 0.0899 185968 Synony… N Leu87Leu Temporal_… Tempor…
## 5 537 T C 0.906 196183 Missen… N Ser144Pro Temporal_… Tempor…
## 6 810 G A 0.923 171545 Missen… N Ala235Thr Temporal_… Tempor…
## # … with 4 more variables: SNP <chr>, Gene <chr>, Background <chr>,
## # Haplotype <chr>
# Save the fully annotated variant table into the output directory
write_csv(updated.haplotype.df, file.path(output.path, "clustered_variants.csv"))